In [1]:
## enable live reloading of the local `munging` package while iterating
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [4]:
from munging import session
from munging import transform
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
## NOTE(review): sklearn.cross_validation is the legacy module name
## (removed in scikit-learn 0.20; newer releases use sklearn.model_selection)
from sklearn.cross_validation import train_test_split
In [28]:
## load train data
data = pd.read_csv("data/predicting_biological_response/train.csv")
data.head(3)
Out[28]:
In [29]:
## hold out 30% of the labeled rows as a pseudo-submission set so that
## generalization can be scored locally (the real test.csv has no labels)
data_index, submission_index = train_test_split(range(data.shape[0]), test_size = 0.3, random_state = 0)
submission_data = data.iloc[submission_index, :]
## NOTE(review): `data` is overwritten in place here -- all cells below assume
## the reduced (70%) frame, so this cell is not safe to re-run mid-session
data = data.iloc[data_index, :]
In [30]:
print data.shape, submission_data.shape
In [31]:
## create session for exploration
## Session wraps the frame and manages the target column and an internal
## train/test split used by the transform and evaluation helpers below
dsession = session.Session(data, target_feature = "Activity", test_frac = 0.3, random_state = 0)
transformers = []  # accumulates fitted transform steps, replayed later on submission data
print dsession.get_parameters()
In [32]:
## find categorical and numerical features
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
print len(numerical_feats)
print len(categorical_feats)
In [33]:
## Know what you are dealing with
## class balance of the binary target
pd.value_counts(data.Activity)
Out[33]:
In [34]:
## missing values
na_feats = dsession.get_features_of(dsession.is_na_feature)
print na_feats
In [16]:
## no need to remove heavy missing value features
In [17]:
## no need to impute missing values
In [35]:
## non-informative features and remove them
noninformative_feats = dsession.get_features_of(dsession.is_noninformative_feature)
print len(noninformative_feats)
remover = dsession.remove_features(feature_names = noninformative_feats)
transformers.append(remover)  # record the step so it can be replayed on submission data
In [36]:
## skewed features and evenize them
skewed_feats = dsession.get_features_of(dsession.is_skewed_numerical_feature)
print len(skewed_feats)
## auto_remove drops the original skewed columns after transformation
evenizer = dsession.evenize_skew_features(skewed_feats, auto_remove=True)
transformers.append(evenizer)
## should print a much smaller count than before the transform
print len(dsession.get_features_of(dsession.is_skewed_numerical_feature))
In [37]:
## whiten features
## and if the ranges of features are not too different, it may not be necessary
## alternatively, you can use the min-max scaler
whitener = dsession.whiten_features(auto_remove=True)
transformers.append(whitener)
#scaler = dsession.minmax_scale_features(auto_remove=True)
#transformers.append(scaler)
In [21]:
## find mutually redundant features
dsession.find_redundant_features()
Out[21]:
In [38]:
## numerize categorical features (replace each level with a target statistic)
numerizer = dsession.numerize_categorical_features(auto_remove=True)
transformers.append(numerizer)
In [23]:
## find redundant features again, as the numerization of categoricals may introduce some
dsession.find_redundant_features()
Out[23]:
In [39]:
## rank numerized features
## AUC-based ranking: how well each numerized categorical separates the target
numerized_features = dsession.get_features_of(dsession.is_numerized_from_categorical_feature)
numerized_features_rank = dsession.rank_features(numerized_features,
by = dsession.numerized_feature_auc_metric,
target_value = 0)
for f, s in numerized_features_rank:
print f, s
## stop listing once features are barely better than random (AUC ~ 0.5)
if s <= 0.55: break
In [40]:
## keep the top-10 numerized features from the ranking above
selected_numerized_feats = [f for f, s in numerized_features_rank[:10]]
#selected_numerized_feats = ['D27', 'D1036', 'D1004', 'D979', 'D1089',
# 'D1109', 'D1125', 'D1061', 'D954', 'D1176']
print selected_numerized_feats
In [249]:
## explore useful numerical features
original_numerical_feats = [f for f in dsession.get_features_of(dsession.is_numerical_feature)
if f not in numerized_features]
print len(original_numerical_feats)
In [143]:
## eyeball density plots (in three batches) to spot features whose
## distribution differs visibly between the two target classes
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[:60])
Out[143]:
In [144]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[60:260])
Out[144]:
In [145]:
dsession.plot_numerical_feature_density(feature_names=original_numerical_feats[260:])
Out[145]:
In [43]:
## hand-picked from the density plots above; the suffixes (_WHITE,
## _LOG1_WHITE, _LOG_WHITE) name the transform chain each column went through
selected_numerical_feats = []
selected_numerical_feats += ["D%i_WHITE" % i for i in [6, 7, 10, 17, 46, 70, 126, 152, 177, 659]]
selected_numerical_feats += ["D%i_LOG1_WHITE" % i for i in [130, 75, 88, 911, 32, 47, 56 ]]
selected_numerical_feats += ["D%i_LOG_WHITE" % i for i in [5]]
print selected_numerical_feats
In [44]:
## sanity check: every hand-picked name must still exist after the transforms
for f in selected_numerical_feats:
print f, f in dsession.get_all_input_features()
In [45]:
## NOTE(review): `seleted_feats` is a typo for selected_feats, kept because
## many later cells reference it by this name
seleted_feats = selected_numerized_feats + selected_numerical_feats
print seleted_feats
selected_train, selected_test = dsession.get_data(selected_features=seleted_feats)
print selected_train.shape, selected_test.shape
## the last column returned by get_data is the target
selected_train_X, selected_train_y = selected_train.iloc[:, :-1], selected_train.iloc[:, -1]
selected_test_X, selected_test_y = selected_test.iloc[:, :-1], selected_test.iloc[:, -1]
In [50]:
def logloss(ytrue, yhat):
    """Binary log-loss (cross-entropy) of predicted probabilities.

    Parameters
    ----------
    ytrue : array-like of 0/1 labels
    yhat : array-like of predicted P(y == 1), same length as ytrue

    Returns
    -------
    float : mean negative log-likelihood.

    Probabilities are clipped away from exactly 0 and 1 so a single
    overconfident wrong prediction cannot make the result infinite
    (the original form returned inf for yhat in {0, 1}); this mirrors
    the epsilon clipping done by sklearn.metrics.log_loss.
    """
    eps = 1e-15
    p = np.clip(np.asarray(yhat, dtype=float), eps, 1 - eps)
    return -np.mean(np.log(np.where(np.asarray(ytrue) == 1, p, 1 - p)))
In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
## modify the complexity to make train and test score close enough
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(selected_train_X, selected_train_y)
print roc_auc_score(selected_train_y, tree.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, tree.predict_proba(selected_test_X)[:, 1])
print tree.score(selected_train_X, selected_train_y)
print tree.score(selected_test_X, selected_test_y)
In [55]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
## modify the complexity to make train and test score close enough
model = SVC(probability=True, kernel = "rbf", gamma = 0.05)
model.fit(selected_train_X, selected_train_y)
## AUC on train vs test
print roc_auc_score(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print roc_auc_score(selected_test_y, model.predict_proba(selected_test_X)[:, 1])
## accuracy on train vs test
print model.score(selected_train_X, selected_train_y)
print model.score(selected_test_X, selected_test_y)
## logloss on train vs test
print logloss(selected_train_y, model.predict_proba(selected_train_X)[:, 1])
print logloss(selected_test_y, model.predict_proba(selected_test_X)[:, 1])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [78]:
## load submission data
## NOTE(review): this rebinds `submission_data` (previously the labeled 30%
## holdout split) to the unlabeled Kaggle test set -- later cells that read
## submission_data / transformed_submission.Activity only work if this cell
## has NOT been run in the current kernel session
submission_data = pd.read_csv("data/predicting_biological_response/test.csv")
submission_data.head(3)
Out[78]:
In [79]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)
In [58]:
selected_submission = transformed_submission.loc[:, seleted_feats]
print selected_submission.shape
In [59]:
## score with the SVC fit above and write a Kaggle submission file,
## reusing the benchmark file as a template for the id column
submission_y = model.predict_proba(selected_submission)[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
submission_solution.to_csv("data/predicting_biological_response/submission1.csv",
header = True, index = False)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [28]:
## svm benchmark
#feats = ['D27', 'D1036', 'D995', 'D1087', 'D1061', 'D979', 'D1002', 'D1169', 'D996', 'D993',
# 'D6', 'D7', 'D10', 'D17', 'D46', 'D70', 'D126', 'D152', 'D177', 'D659',
# 'D130', 'D131', 'D75', 'D88', 'D911', 'D103', 'D32', 'D47', 'D56', 'D16', 'D5']
feats = submission_data.columns
## first column of train.csv is the target (Activity); the rest are inputs
X = data.iloc[:, 1:].loc[:, feats]
y = data.iloc[:, 0]
from sklearn.svm import SVC
svc = SVC(probability = True)
svc.fit(X, y)
Out[28]:
In [29]:
## train-set scores only -- optimistic, just a reference point
print roc_auc_score(y, svc.predict_proba(X)[:, 1])
print svc.score(X, y)
In [30]:
## write the untuned-SVC baseline predictions as the benchmark submission
submission_yy = svc.predict_proba(submission_data.loc[:, feats])[:, 1]
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_yy
submission_solution.to_csv("data/predicting_biological_response/benchmark_submission.csv",
header = True, index = False)
In [31]:
## train logloss of the benchmark model
logloss(y, svc.predict_proba(X)[:, 1])
Out[31]:
In [ ]:
In [ ]:
In [32]:
from munging import model
In [33]:
## stack four tree ensembles; a logistic regression blends their predictions
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
blender = model.ModelBlender(feature_names = data.columns[1:],
target_name = "Activity",
models = models,
blender = LogisticRegression())
In [34]:
blender.fit(data)
Out[34]:
In [35]:
yhat = blender.predict(data)
In [36]:
## true label vs blended probability -- training data, so optimistic
plt.plot(y, yhat, ".")
Out[36]:
In [ ]:
In [46]:
selected_features = seleted_feats
print selected_features
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
## blend via the session so its internal train/test split is respected
blender = dsession.blend_models(models = models,
blender = LogisticRegression(),
feature_names = selected_features)
In [47]:
## blending weights learned for the four base models
blender.blender.coef_
Out[47]:
In [51]:
from sklearn.metrics import roc_auc_score
train_matrix, test_matrix = dsession.get_data(selected_features)
trainyhat = blender.predict(train_matrix)
testyhat = blender.predict(test_matrix)
train_y, test_y = train_matrix.Activity, test_matrix.Activity
print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)
In [53]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)
submission_matrix = transformed_submission.loc[:, selected_features]
## NOTE(review): reading Activity here assumes submission_data is still the
## labeled 30% holdout split; if cell In[78] (loading test.csv) ran first,
## this line fails because the real test set has no target column
submission_y = transformed_submission.Activity
submission_yhat = blender.predict(submission_matrix)
print 'submission auc:', roc_auc_score(submission_y, submission_yhat)
print 'submission accuracy:', np.mean(submission_y == (submission_yhat>=0.5))
print 'submission logloss:', logloss(submission_y, submission_yhat)
In [258]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)
In [162]:
## write the tree-blender predictions as submission2.csv
submission_y = blender.predict(transformed_submission.loc[:, selected_features])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
submission_solution.to_csv("data/predicting_biological_response/submission2.csv",
header = True, index = False)
The blender's base models are tree-based ensembles:
models = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')]
Some observations:
1. numerizing categorical features is risky, especially when a categorical feature has many levels, because what is observed in the training data (and even the validation data) may not apply well to new data (e.g., the submission data) -- this can be seen from the fact that the logloss, AUC, and accuracy on train and test change little across the different data transformations, while the submission accuracy drops a lot
2. feature selection based on the numerized categoricals and the numerical density plots, however, seems to improve generality
3. considering that the blender model used here is a combination of tree-based models, it should be the most robust to the data transforms
As a trend in modern data mining practice, many people prefer using an ensemble model over manual feature selection.
In [170]:
from IPython.display import display_html  # NOTE(review): imported but unused below
## compare level frequencies between train and submission data for the
## selected numerized categoricals -- shows how well train levels transfer
for f in ['D27', 'D1036', 'D1004', 'D979', 'D1089', 'D1109', 'D1125', 'D1061', 'D954', 'D1176']:
print '='*20, f, '='*20
print(pd.value_counts(data[f]))
print(pd.value_counts(submission_data[f]))
In [179]:
dsession.print_categorial_crosstable(feature_names=["D1176"])
Out[179]:
In [ ]:
In [ ]:
In [252]:
selected_features = seleted_feats
print selected_features
blender = SVC(probability = True)
In [253]:
train_matrix, test_matrix = dsession.get_data(selected_features)
train_y, test_y = train_matrix.Activity, test_matrix.Activity
blender.fit(train_matrix.iloc[:, :-1], train_y)
trainyhat = blender.predict_proba(train_matrix.iloc[:, :-1])[:, 1]
testyhat = blender.predict_proba(test_matrix.iloc[:, :-1])[:, 1]
print 'train auc:', roc_auc_score(train_y, trainyhat)
print 'train accuracy:', np.mean(train_y == (trainyhat>=0.5))
print 'train logloss:', logloss(train_y, trainyhat)
print 'test auc:', roc_auc_score(test_y, testyhat)
print 'test accuracy:', np.mean(test_y == (testyhat>=0.5))
print 'test logloss:', logloss(test_y, testyhat)
In [254]:
## apply accumulated transform steps and apply it to submission data
combiner = dsession.get_transform_combiners(transformers)
transformed_submission = combiner.transform(submission_data)
print transformed_submission.shape
print transformed_submission.head(3)
In [255]:
## score the SVC on the transformed submission data and write the file;
## fall back to all columns when no explicit feature selection was made
submission_y = (blender.predict_proba(transformed_submission)[:, 1]
                if selected_features is None
                else blender.predict_proba(transformed_submission.loc[:, selected_features])[:, 1])
submission_solution = pd.read_csv("data/predicting_biological_response/svm_benchmark.csv")
submission_solution.PredictedProbability = submission_y
## BUG FIX: this previously wrote submission2.csv, silently overwriting the
## tree-blender submission produced earlier; give the SVC run its own file
submission_solution.to_csv("data/predicting_biological_response/submission3.csv",
header = True, index = False)
The blender model here is an SVM.
Some observations:
1. a similar pattern to the tree-based blender model
2. it seems that the power of the ensemble model is greater than that of manual feature selection
In [ ]: